Use output from Humann2 to assess the gene families, pathway abundance, and pathway coverage, assigned to Gardnerella vaginalis across samples.
library(tidyverse)
## ── Attaching packages ────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.1 ✔ dplyr 0.8.0.1
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggrepel)
Metadata
sgDF <- read_delim("/Users/hlberman/Documents/GitHub/pregnancy_metagenome/shotgunMetadata.tsv", delim = "\t")%>%
mutate(SampleID=as.character(SampleID))
## Parsed with column specification:
## cols(
## .default = col_double(),
## SampleIndex = col_character(),
## SampleType = col_character(),
## TermPre = col_character(),
## Operator = col_character(),
## PctTrim = col_character(),
## RNAlater = col_character(),
## Cohort = col_character()
## )
## See spec(...) for full column specifications.
load("/Users/hlberman/Documents/GitHub/pregnancy_metagenome/methods_comparisons/PS_RData/metaphlan_PS.RData")
Gardnerella abundance table
#Clade assignments
gardRelativeAbundance <- read_delim("/Users/hlberman/Documents/GitHub/pregnancy_metagenome/gard_mapping/clade_assignments/gardRelativeAbundance.tsv", delim = "\t") %>%
mutate(SampleID=as.character(SampleID),
Clade=as.character(Clade))
## Parsed with column specification:
## cols(
## SampleID = col_double(),
## Clade = col_double(),
## n = col_double(),
## propClade = col_double()
## )
SampleIDlist <- rep(unique(sgDF$SampleID), 6) %>%
.[order(.)]
metaphlanGard <- PS_metaphlan_all@otu_table %>%
t() %>%
as.data.frame() %>%
mutate(SampleID=rownames(.)) %>%
select(Gardnerella_vaginalis, SampleID)
## Loading required package: phyloseq
cladeFrame <- tibble(SampleID=SampleIDlist,
Clade=rep(c("1","2", "3", "4", "5", "6"), 107),
n=0,
propClade=0)
gardRelativeAbundance0 <- gardRelativeAbundance %>%
full_join(cladeFrame) %>%
group_by(SampleID, Clade) %>%
filter(n==max(n)) %>%
ungroup() %>%
select(-n) %>%
spread(Clade, propClade) %>%
mutate(Clades=case_when((`1`>0 |`2`>0) & `3`==0 & `4`==0 & `5`==0 & `6`==0 ~ "1_2",
`1`==0 & `2`==0 & (`3`>0 | `4`>0) & `5`==0 & `6`==0 ~ "3_4",
`1`==0 & `2`==0 & `3`==0 & `4`==0 & (`5`>0 | `6`>0) ~ "5_6",
`1` > 0 | `2` > 0 | `3`>0 | `4`>0 | `5`>0 | `6`>0 ~ "mix",
`1`==0 & `2` ==0 & `3`==0 & `4`==0 & `5`==0 & `6`==0 ~"none")) %>%
left_join(metaphlanGard, by="SampleID") %>%
left_join(unique.data.frame(sgDF[,c("SampleID", "SubjectID", "TermPre", "GWdell", "Cohort", "GWcoll")])) %>%
mutate(SubjectID=as.character(SubjectID))
## Joining, by = c("SampleID", "Clade", "n", "propClade")
## Joining, by = "SampleID"
Humann2 Output tables
#humann2 outs
dataDir <- "/Volumes/GoogleDrive/My Drive/Callahan Lab/vagMicrobiomeMetagenome/humann2/out_tables"
GF_path <- file.path(dataDir, "VM_genefamilies_names_cpm.tsv")
GFP_path <- file.path(dataDir, "VM_genefamilies_paths_cpm.tsv")
GOnames_path <- file.path(dataDir, "map_go_name.txt")
GFGO_path <- file.path(dataDir, "map_go_uniref90.txt")
PWnames_path <- file.path(dataDir, "map_metacyc-pwy_name.txt")
PC_path <- file.path(dataDir, "VM_pathcoverage.tsv")
PA_path <- file.path(dataDir, "VM_pathabundance_cpm.tsv")
# read in data frames
GF <- read_delim(GF_path, delim = "\t") # Gene family abundances
## Parsed with column specification:
## cols(
## .default = col_double(),
## `# Gene Family` = col_character()
## )
## See spec(...) for full column specifications.
GFP <- read_delim(GFP_path, delim = "\t") # pathways for each gene family
## Parsed with column specification:
## cols(
## .default = col_character(),
## `1000801248_Abundance` = col_double()
## )
## See spec(...) for full column specifications.
## Warning: 1662 parsing failures.
## row col expected actual file
## 1 -- 108 columns 2 columns '/Volumes/GoogleDrive/My Drive/Callahan Lab/vagMicrobiomeMetagenome/humann2/out_tables/VM_genefamilies_paths_cpm.tsv'
## 2 -- 108 columns 2 columns '/Volumes/GoogleDrive/My Drive/Callahan Lab/vagMicrobiomeMetagenome/humann2/out_tables/VM_genefamilies_paths_cpm.tsv'
## 3 -- 108 columns 2 columns '/Volumes/GoogleDrive/My Drive/Callahan Lab/vagMicrobiomeMetagenome/humann2/out_tables/VM_genefamilies_paths_cpm.tsv'
## 4 -- 108 columns 2 columns '/Volumes/GoogleDrive/My Drive/Callahan Lab/vagMicrobiomeMetagenome/humann2/out_tables/VM_genefamilies_paths_cpm.tsv'
## 5 -- 108 columns 2 columns '/Volumes/GoogleDrive/My Drive/Callahan Lab/vagMicrobiomeMetagenome/humann2/out_tables/VM_genefamilies_paths_cpm.tsv'
## ... ... ........... ......... .....................................................................................................................
## See problems(...) for more details.
PWnames <- read_delim(PWnames_path, delim = "\t", col_names = c("Pathway", "pathwayAnnotation"))
## Parsed with column specification:
## cols(
## Pathway = col_character(),
## pathwayAnnotation = col_character()
## )
PC <- read_delim(PC_path, delim = "\t") # pathway coverage
## Parsed with column specification:
## cols(
## .default = col_double(),
## `# Pathway` = col_character()
## )
## See spec(...) for full column specifications.
PA <- read_delim(PA_path, delim = "\t") # pathway abundnance
## Parsed with column specification:
## cols(
## .default = col_double(),
## `# Pathway` = col_character()
## )
## See spec(...) for full column specifications.
GOnames <- read_delim(GOnames_path, delim = "\t", col_names = c("GO", "Function")) # names and annotations of GO categories
## Parsed with column specification:
## cols(
## GO = col_character(),
## Function = col_character()
## )
GFGO <- read_lines(GFGO_path) %>% # read in the file as a list
str_split("\t")
GFP0 <- GFP %>%
select(`# Pathway`) %>% # keep only pathway and gene family names
filter(str_detect(`# Pathway`, "Gardnerella")) %>% # keep only Gardnerella
separate(`# Pathway`, c("Pathway", "GeneFamily"), sep ="\\|g__Gardnerella.s__Gardnerella_vaginalis\\|", fill = "right") %>% # separate
separate(GeneFamily, c("uniref", "unirefAnnotation"), ": ") %>%
filter(uniref != "") # remove empty pathways
head(GFP0)
## # A tibble: 6 x 3
## Pathway uniref unirefAnnotation
## <chr> <chr> <chr>
## 1 COA-PWY-1 UniRef90_I4LNX4 Dephospho-CoA kinase
## 2 COA-PWY-1 UniRef90_D2R9V1 Phosphopantetheine adenylyltransferase
## 3 COA-PWY-1 UniRef90_D2RAK6 Dephospho-CoA kinase
## 4 COA-PWY-1 UniRef90_D2RBT1 Type III pantothenate kinase
## 5 NONOXIPENT-PWY UniRef90_E3D9X8 Transaldolase
## 6 NONOXIPENT-PWY UniRef90_D6SZM2 Transaldolase
GOcategories <- map(GFGO, `[[`, 1) %>% # get the GO category labels
unlist()
GFGO0 <- map2(GOcategories, GFGO, ~paste(.x, .y, sep=",")) %>% # paste a separable (important for later making a dataframe) GO category to each gene family in each category
unlist() %>% # collapse list
as_tibble() %>% # make dataframe
separate(1, c("GO", "uniref"), ",") %>% #separate into columns
filter(!str_detect(uniref, "GO:[0-9]*$")) # remove rows with GO category label in uniref column
## Warning: Calling `as_tibble()` on a vector is discouraged, because the behavior is likely to change in the future. Use `tibble::enframe(name = NULL)` instead.
## This warning is displayed once per session.
head(GFGO0)
## # A tibble: 6 x 2
## GO uniref
## <chr> <chr>
## 1 GO:0000001 UniRef90_A0A016PS51
## 2 GO:0000001 UniRef90_A1C8D3
## 3 GO:0000001 UniRef90_A1CBR9
## 4 GO:0000001 UniRef90_A1CE31
## 5 GO:0000001 UniRef90_A1CNY1
## 6 GO:0000001 UniRef90_A1CPB1
GF0 <- GF %>%
filter(str_detect(`# Gene Family`, "Gardnerella")) %>%
as.data.frame()
rownames(GF0) <- GF0[,1]
GF1 <- GF0 %>%
select(-`# Gene Family`)
GF2 <- GF1 %>%
t() %>%
as.data.frame(strinsAsFactors=FALSE) %>%
mutate(SampleID=str_extract(colnames(GF1), "[0-9]{10}")) %>%
gather("GeneFamily", "Abundance", contains("Gardnerella")) %>%
mutate(GeneFamily=str_extract(GeneFamily, ".*(?=.g__Gardnerella)")) %>%
separate(GeneFamily, c("uniref", "unirefAnnotation"), ": ", fill = "right") %>%
left_join(gardRelativeAbundance0[,c("SampleID", "SubjectID", "TermPre", "GWdell", "GWcoll", "Cohort", "Clades")], by="SampleID")
head(GF2)
## SampleID uniref unirefAnnotation Abundance SubjectID TermPre
## 1 1000801248 UniRef90_unknown <NA> 293.681 10008 T
## 2 1000801318 UniRef90_unknown <NA> 168.545 10008 T
## 3 1000801368 UniRef90_unknown <NA> 262.059 10008 T
## 4 1001301158 UniRef90_unknown <NA> 5946.180 10013 P
## 5 1001301248 UniRef90_unknown <NA> 11876.100 10013 P
## 6 1001301318 UniRef90_unknown <NA> 807.692 10013 P
## GWdell GWcoll Cohort Clades
## 1 41 25 Stanford 3_4
## 2 41 32 Stanford 3_4
## 3 41 37 Stanford 3_4
## 4 35 16 Stanford mix
## 5 35 25 Stanford mix
## 6 35 31 Stanford mix
PA0 <- PA %>%
filter(str_detect(`# Pathway`, "Gardnerella")) %>%
as.data.frame()
rownames(PA0) <- PA0[,1]
PA1 <- PA0 %>%
select(-`# Pathway`)
PA2 <- PA1 %>%
t() %>%
as.data.frame(strinsAsFactors=FALSE) %>%
mutate(SampleID=str_extract(colnames(PA1), "[0-9]{10}")) %>%
gather("Pathway", "Abundance", contains("Gardnerella")) %>%
mutate(Pathway=str_extract(Pathway, ".*(?=.g__Gardnerella)")) %>%
left_join(gardRelativeAbundance0[,c("SampleID", "SubjectID", "TermPre", "GWdell", "GWcoll", "Cohort", "Clades")], by="SampleID")
head(PA2)
## SampleID Pathway Abundance SubjectID TermPre GWdell GWcoll
## 1 1000801248 UNINTEGRATED 5507.21 10008 T 41 25
## 2 1000801318 UNINTEGRATED 3295.92 10008 T 41 32
## 3 1000801368 UNINTEGRATED 4797.42 10008 T 41 37
## 4 1001301158 UNINTEGRATED 73881.90 10013 P 35 16
## 5 1001301248 UNINTEGRATED 174983.00 10013 P 35 25
## 6 1001301318 UNINTEGRATED 15372.60 10013 P 35 31
## Cohort Clades
## 1 Stanford 3_4
## 2 Stanford 3_4
## 3 Stanford 3_4
## 4 Stanford mix
## 5 Stanford mix
## 6 Stanford mix
PC0 <- PC %>%
filter(str_detect(`# Pathway`, "Gardnerella")) %>%
as.data.frame()
rownames(PC0) <- PC0[,1]
PC1 <- PC0 %>%
select(-`# Pathway`)
PC2 <- PC1 %>%
t() %>%
as.data.frame(strinsAsFactors=FALSE) %>%
mutate(SampleID=str_extract(colnames(PC1), "[0-9]{10}")) %>%
gather("Pathway", "Coverage", contains("Gardnerella")) %>%
mutate(Pathway=str_extract(Pathway, ".*(?=.g__Gardnerella)")) %>%
left_join(gardRelativeAbundance0[,c("SampleID", "SubjectID", "TermPre", "GWdell", "GWcoll", "Cohort", "Clades")], by="SampleID")
head(PC2)
## SampleID Pathway Coverage SubjectID TermPre GWdell GWcoll
## 1 1000801248 UNINTEGRATED 1 10008 T 41 25
## 2 1000801318 UNINTEGRATED 1 10008 T 41 32
## 3 1000801368 UNINTEGRATED 1 10008 T 41 37
## 4 1001301158 UNINTEGRATED 1 10013 P 35 16
## 5 1001301248 UNINTEGRATED 1 10013 P 35 25
## 6 1001301318 UNINTEGRATED 1 10013 P 35 31
## Cohort Clades
## 1 Stanford 3_4
## 2 Stanford 3_4
## 3 Stanford 3_4
## 4 Stanford mix
## 5 Stanford mix
## 6 Stanford mix
geneFamilies <- unique(paste(GF2$uniref, GF2$unirefAnnotation, sep = ": "))
GFfoldChange <- GF2 %>%
filter(Clades=="1_2"|Clades=="3_4") %>%
group_by(uniref, unirefAnnotation, Clades, Cohort) %>%
summarise(mean(Abundance)) %>%
ungroup() %>%
spread(Clades, `mean(Abundance)`) %>%
filter(Cohort=="UAB"|Cohort=="Stanford",
`1_2`>0,
`3_4`>0) %>%
mutate(FoldChange=case_when(`1_2`>`3_4`~`1_2`/`3_4`,
`3_4`>`1_2`~-`3_4`/`1_2`)) #%>%
# left_join(GFGO0, by="uniref") %>% # for adding GO information to the dataframe. removed for now as multiple GO categories were assigned to many uniref
# left_join(GOnames, by= "GO")
labels1 <- GFfoldChange[order(GFfoldChange$FoldChange),] %>%
select(uniref, unirefAnnotation, Cohort, FoldChange)
labels1 <- labels1[1:10,]
labels2 <- GFfoldChange[order(-GFfoldChange$FoldChange),] %>%
select(uniref, unirefAnnotation, Cohort, FoldChange)
labels2 <- labels2[1:10,]
labels <- rbind(labels1, labels2) %>%
filter(unirefAnnotation!="NO_NAME")
ggplot(GFfoldChange, aes(x=uniref, y=FoldChange, label = unirefAnnotation)) +
geom_point() +
theme(axis.text.x = element_blank()) +
ggrepel::geom_text_repel(data=labels) +
labs(y=element_blank()) +
facet_wrap(~Cohort) +
annotate("text", label = "Positive=greater in 1/2", x=200, y = 6000)
## Gene Families with Pathway information
GFfoldChangePath <- GFfoldChange %>%
left_join(GFP0, by=c("uniref", "unirefAnnotation")) %>%
filter(!is.na(Pathway)) %>%
left_join(PWnames, by="Pathway")
ggplot(GFfoldChangePath, aes(x=uniref, y=FoldChange, color = paste(Pathway, pathwayAnnotation), label=unirefAnnotation)) +
geom_text(size=2) +
theme(axis.text.x = element_blank()) +
labs(y=element_blank(), color="Pathway") +
facet_wrap(~Cohort)
List of greatest fold change uniref gene families (pathways not annotated for greatest fold change uniref genes)
GFfoldChange %>%
filter(FoldChange>=2000|FoldChange<=-1000) %>%
print(n = nrow(.), width = getOption("tibble.width"), n_extra = NULL)
## # A tibble: 83 x 6
## uniref unirefAnnotation Cohort `1_2` `3_4` FoldChange
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 UniRef90_… UDP-N-acetylenolpyruvoylgl… Stanf… 2.99e-3 4.49e+0 -1501.
## 2 UniRef90_… Ribonuclease HI Stanf… 3.50e-3 4.65e+0 -1325.
## 3 UniRef90_… HTH domain protein Stanf… 3.36e-3 4.14e+0 -1233.
## 4 UniRef90_… Signal recognition particl… Stanf… 4.41e-3 4.57e+0 -1036.
## 5 UniRef90_… NO_NAME Stanf… 2.93e-3 4.17e+0 -1423.
## 6 UniRef90_… Drug resistance MFS transp… Stanf… 2.32e-3 4.43e+0 -1911.
## 7 UniRef90_… Universal stress family pr… Stanf… 3.83e-3 5.03e+0 -1313.
## 8 UniRef90_… Sortase family protein Stanf… 3.58e-3 4.20e+0 -1172.
## 9 UniRef90_… Cell division protein FtsZ Stanf… 3.15e-3 4.11e+0 -1308.
## 10 UniRef90_… Cell envelope-like functio… Stanf… 2.49e-3 3.73e+0 -1499.
## 11 UniRef90_… Mg chelatase-like protein Stanf… 2.25e-3 3.70e+0 -1646.
## 12 UniRef90_… RNA polymerase sigma facto… Stanf… 2.54e-3 3.36e+0 -1324.
## 13 UniRef90_… NO_NAME Stanf… 3.61e-3 7.99e+0 -2211.
## 14 UniRef90_… Allantoate amidohydrolase Stanf… 3.03e-3 3.38e+0 -1114.
## 15 UniRef90_… Tetratricopeptide repeat p… Stanf… 1.02e-3 1.75e+0 -1722.
## 16 UniRef90_… Aminotransferase, class I/… Stanf… 3.23e-3 3.47e+0 -1075.
## 17 UniRef90_… Na(+)/H(+) antiporter NhaA Stanf… 6.04e-3 1.09e+1 -1799.
## 18 UniRef90_… Carbamoyl-phosphate syntha… Stanf… 3.16e-3 3.85e+0 -1218.
## 19 UniRef90_… Dihydroorotase Stanf… 2.66e-3 3.88e+0 -1463.
## 20 UniRef90_… ABC transporter, permease … Stanf… 3.96e-3 4.24e+0 -1070.
## 21 UniRef90_… DivIVA domain protein Stanf… 2.84e-3 4.58e+0 -1611.
## 22 UniRef90_… NO_NAME Stanf… 3.27e-3 3.94e+0 -1203.
## 23 UniRef90_… ABC transporter, permease … Stanf… 4.20e-3 4.94e+0 -1175.
## 24 UniRef90_… Phosphoenolpyruvate-protei… Stanf… 4.09e-3 4.49e+0 -1098.
## 25 UniRef90_… NO_NAME Stanf… 4.64e-3 5.17e+0 -1115.
## 26 UniRef90_… Transcriptional regulator,… Stanf… 3.87e-3 4.53e+0 -1169.
## 27 UniRef90_… Alkyl hydroperoxide reduct… Stanf… 4.40e-3 4.80e+0 -1091.
## 28 UniRef90_… Heavy metal translocating … Stanf… 3.83e-3 4.97e+0 -1297.
## 29 UniRef90_… Cell cycle protein, FtsW/R… Stanf… 5.04e-3 5.96e+0 -1182.
## 30 UniRef90_… Protein phosphatase 2C Stanf… 4.43e-3 5.77e+0 -1303.
## 31 UniRef90_… Ribosomal RNA small subuni… Stanf… 4.49e-3 6.00e+0 -1337.
## 32 UniRef90_… Kinase domain protein Stanf… 3.97e-3 5.49e+0 -1382.
## 33 UniRef90_… MATE efflux family protein Stanf… 2.77e-3 4.91e+0 -1775.
## 34 UniRef90_… Chloride transporter, ClC … Stanf… 2.75e-3 5.47e+0 -1987.
## 35 UniRef90_… DNA polymerase III, subuni… Stanf… 2.98e-3 4.94e+0 -1661.
## 36 UniRef90_… Aspartate-semialdehyde deh… Stanf… 3.65e-3 4.72e+0 -1291.
## 37 UniRef90_… NO_NAME Stanf… 3.03e-3 4.26e+0 -1405.
## 38 UniRef90_… Efflux ABC transporter, pe… Stanf… 2.53e-3 4.60e+0 -1821.
## 39 UniRef90_… Subtilisin-like serine pro… Stanf… 2.14e-3 2.21e+0 -1033.
## 40 UniRef90_… dGTP triphosphohydrolase Stanf… 2.96e-3 4.03e+0 -1364.
## 41 UniRef90_… NO_NAME Stanf… 3.52e-3 4.17e+0 -1183.
## 42 UniRef90_… NO_NAME Stanf… 5.78e-4 1.03e+0 -1775.
## 43 UniRef90_… Subtilisin-like serine pro… Stanf… 2.29e-3 3.48e+0 -1522.
## 44 UniRef90_… Conserved uncharacterized … Stanf… 4.17e-3 4.30e+0 -1031.
## 45 UniRef90_… Chromosome segregation ATP… Stanf… 5.64e-4 5.77e-1 -1022.
## 46 UniRef90_… NO_NAME Stanf… 2.63e+1 1.24e-2 2115.
## 47 UniRef90_… NO_NAME Stanf… 2.59e+1 1.23e-2 2109.
## 48 UniRef90_… Beta galactosidase small c… Stanf… 2.64e+1 7.29e-3 3616.
## 49 UniRef90_… Aminopeptidase P domain pr… Stanf… 2.18e+1 7.60e-3 2866.
## 50 UniRef90_… Protein phosphatase 2C Stanf… 2.61e+1 7.11e-3 3674.
## 51 UniRef90_… Cell cycle protein, FtsW/R… Stanf… 2.59e+1 8.60e-3 3009.
## 52 UniRef90_… NO_NAME Stanf… 2.56e+1 9.29e-3 2758.
## 53 UniRef90_… N-acetylglucosamine-6-phos… Stanf… 2.80e+1 9.87e-3 2836.
## 54 UniRef90_… Acyltransferase Stanf… 2.04e+1 9.61e-3 2122.
## 55 UniRef90_… Transcriptional regulator,… Stanf… 2.67e+1 1.27e-2 2095.
## 56 UniRef90_… ABC transporter, solute-bi… Stanf… 2.73e+1 1.02e-2 2691.
## 57 UniRef90_… ABC transporter, solute-bi… Stanf… 2.02e+1 8.84e-3 2283.
## 58 UniRef90_… Glycosyl hydrolase, family… Stanf… 1.34e+1 4.12e-3 3262.
## 59 UniRef90_… Actinobacterial surface-an… Stanf… 1.92e+1 8.84e-3 2170.
## 60 UniRef90_… Homoserine kinase Stanf… 2.62e+1 1.24e-2 2108.
## 61 UniRef90_… Septum formation protein M… Stanf… 2.52e+1 8.54e-3 2949.
## 62 UniRef90_… DivIVA domain repeat prote… Stanf… 1.79e+1 7.96e-3 2252.
## 63 UniRef90_… Dihydroorotase Stanf… 1.81e+1 8.67e-3 2085.
## 64 UniRef90_… NO_NAME Stanf… 1.82e+1 6.18e-3 2942.
## 65 UniRef90_… Tetratricopeptide repeat p… Stanf… 1.80e+1 6.97e-3 2579.
## 66 UniRef90_… Bifunctional purine biosyn… Stanf… 1.88e+1 7.15e-3 2623.
## 67 UniRef90_… Replicative DNA helicase Stanf… 2.53e+1 8.27e-3 3065.
## 68 UniRef90_… NO_NAME Stanf… 2.61e+1 9.40e-3 2775.
## 69 UniRef90_… Transporter, CPA2 family Stanf… 2.48e+1 1.15e-2 2168.
## 70 UniRef90_… Cyclomaltodextrinase Stanf… 1.93e+1 6.79e-3 2847.
## 71 UniRef90_… Cell envelope-like functio… Stanf… 1.91e+1 8.40e-3 2277.
## 72 UniRef90_… Alkyl hydroperoxide reduct… Stanf… 1.87e+1 7.41e-3 2527.
## 73 UniRef90_… AMP-binding enzyme Stanf… 1.83e+1 6.59e-3 2785.
## 74 UniRef90_… NO_NAME Stanf… 1.47e+1 4.79e-3 3062.
## 75 UniRef90_… NO_NAME Stanf… 7.73e+0 3.53e-3 2191.
## 76 UniRef90_… Primosome assembly protein… Stanf… 1.69e+1 4.77e-3 3544.
## 77 UniRef90_… FtsW-like protein Stanf… 1.97e+1 9.29e-3 2123.
## 78 UniRef90_… Cation-transporting ATPase Stanf… 2.67e-3 3.04e+0 -1142.
## 79 UniRef90_… Deoxyguanosinetriphosphate… Stanf… 1.95e+1 9.67e-3 2012.
## 80 UniRef90_… Dihydroorotate dehydrogena… Stanf… 3.85e-3 4.58e+0 -1188.
## 81 UniRef90_… NO_NAME Stanf… 7.92e+0 2.64e-3 2999.
## 82 UniRef90_… UvrD/REP helicase Stanf… 1.62e+1 2.70e-3 6006.
## 83 UniRef90_… NO_NAME Stanf… 1.77e+1 8.21e-3 2162.
alternatePresence <- GF2 %>%
group_by(uniref, unirefAnnotation, Clades, Cohort) %>%
summarise(mean(Abundance)) %>%
ungroup() %>%
spread(Clades, `mean(Abundance)`) %>%
filter(Cohort=="UAB"|Cohort=="Stanford",
`1_2`==0|`3_4`==0,
!(`1_2`==0&`3_4`==0)) %>%
mutate(`3_4`=-`3_4`) %>%
gather("Clades", "Abundance", c(`1_2`, `3_4`)) %>% #make abundance one column instead of separate T and P columsn
filter(Abundance!=0) %>% # get rid of duplicate entries where abundance = 0
left_join(GFP0, by=c("uniref", "unirefAnnotation")) %>%
left_join(PWnames, by="Pathway")
ggplot(alternatePresence, aes(x=reorder(uniref, Abundance), y=Abundance))+
geom_col() +
theme(axis.text.x = element_blank()) +
labs(x="Uniref90", y="Average reads per million") +
facet_wrap(~Cohort) +
annotate("text", label = "Positive=greater in 1/2", x=300, y = 400)
# remove bulk of the figure with little variation
alternatePresence %>%
filter(Abundance>=80|Abundance<=-30) %>%
ggplot(aes(x=reorder(uniref, Abundance), y=Abundance)) +
geom_col() +
theme(axis.text.x = element_blank()) +
labs(x="Uniref90", y="Average reads per million") +
facet_wrap(~Cohort) +
annotate("text", label = "Positive=greater in 1/2", x=3, y = 400)
Save list of alternately present genes and show list of greatest differences
alternatePresence %>%
filter(Abundance>=800|Abundance<=-30) %>%
arrange(-Abundance) %>%
print(n = nrow(.), width = getOption("tibble.width"), n_extra = NULL)
## # A tibble: 11 x 10
## uniref unirefAnnotation Cohort `5_6` mix none Clades Abundance
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 UniRe… NO_NAME Stanf… 0 170. 0 3_4 -32.2
## 2 UniRe… NO_NAME Stanf… 0 491. 0 3_4 -33.8
## 3 UniRe… NO_NAME Stanf… 0 1.81 0 3_4 -33.8
## 4 UniRe… NO_NAME Stanf… 0 44.8 0 3_4 -35.6
## 5 UniRe… NO_NAME Stanf… 0 126. 0 3_4 -39.6
## 6 UniRe… NO_NAME Stanf… 0 290. 0 3_4 -40.8
## 7 UniRe… NO_NAME Stanf… 0 234. 0 3_4 -49.2
## 8 UniRe… NO_NAME Stanf… 0 409. 0 3_4 -57.7
## 9 UniRe… NO_NAME Stanf… 0 412. 0 3_4 -68.7
## 10 UniRe… NO_NAME Stanf… 0 218. 0 3_4 -74.4
## 11 UniRe… NO_NAME Stanf… 0 499. 0 3_4 -84.9
## # … with 2 more variables: Pathway <chr>, pathwayAnnotation <chr>
Need to continure to assess gene function and pathway, and also look at distribution across samples of each of these genes.
Assess reads per million in each annotated pathway (MetaCyc) from Gardnerella vaginalis sample across in term versus preterm birth. Cumulative reads in these pathways attributed to Gardnerella vaginalis determined by Humann2.
pathways <- unique(PA2$Pathway)
pathways
## [1] "UNINTEGRATED"
## [2] "COA-PWY-1: coenzyme A biosynthesis II (mammalian)"
## [3] "NONOXIPENT-PWY: pentose phosphate pathway (non-oxidative branch)"
## [4] "PENTOSE-P-PWY: pentose phosphate pathway"
## [5] "PEPTIDOGLYCANSYN-PWY: peptidoglycan biosynthesis I (meso-diaminopimelate containing)"
## [6] "PWY-5100: pyruvate fermentation to acetate and lactate II"
## [7] "PWY-5686: UMP biosynthesis"
## [8] "PWY-6122: 5-aminoimidazole ribonucleotide biosynthesis II"
## [9] "PWY-6151: S-adenosyl-L-methionine cycle I"
## [10] "PWY-6277: superpathway of 5-aminoimidazole ribonucleotide biosynthesis"
## [11] "PWY-6385: peptidoglycan biosynthesis III (mycobacteria)"
## [12] "PWY-6386: UDP-N-acetylmuramoyl-pentapeptide biosynthesis II (lysine-containing)"
## [13] "PWY-6387: UDP-N-acetylmuramoyl-pentapeptide biosynthesis I (meso-diaminopimelate containing)"
## [14] "PWY-7219: adenosine ribonucleotides de novo biosynthesis"
## [15] "PWY-7221: guanosine ribonucleotides de novo biosynthesis"
## [16] "THRESYN-PWY: superpathway of L-threonine biosynthesis"
map(pathways, ~filter(PA2, Pathway==.x) %>%
ggplot(., aes(x=Clades, y=Abundance, color=Clades)) +
geom_jitter() +
geom_boxplot(alpha=0) +
scale_y_log10() +
labs(title=.x, x=element_blank(), y="Abundance (reads per million") +
#scale_x_discrete(labels = c("Preterm", "Term")) +
#scale_color_manual(values=c("#56B4E9", "#D55E00")) +
theme(legend.position = "none") +
facet_wrap(~Cohort))
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
Need to find cohort information of samples of unkown cohort. Some differences apparent between Stanford and UAB cohorts. of note is that no PTB samples have reads from the peptidoglycan biosynthesis III pathway in the Stanford cohort and only 3 Stanford term samples had reads from this pathway.
PC3 <- PC2 %>%
filter(Pathway!="UNINTEGRATED") # covereage value of "unintegrated" is meaningless
map(pathways[2:length(pathways)], ~filter(PC3, Pathway==.x) %>% # remove unintegrated from pathways list.
ggplot(., aes(x=Clades, y=Coverage, color=Clades)) +
geom_jitter() +
geom_boxplot(alpha=0) +
labs(title=.x, x=element_blank()) +
#scale_x_discrete(labels = c("Preterm", "Term")) +
ylim(0,1) +
#scale_color_manual(values=c("#56B4E9", "#D55E00")) +
theme(legend.position = "none") +
facet_wrap(~Cohort))
## [[1]]
## Warning: Removed 25 rows containing missing values (geom_point).
##
## [[2]]
## Warning: Removed 50 rows containing missing values (geom_point).
##
## [[3]]
## Warning: Removed 43 rows containing missing values (geom_point).
##
## [[4]]
## Warning: Removed 28 rows containing missing values (geom_point).
##
## [[5]]
## Warning: Removed 23 rows containing missing values (geom_point).
##
## [[6]]
## Warning: Removed 15 rows containing missing values (geom_point).
##
## [[7]]
## Warning: Removed 14 rows containing missing values (geom_point).
##
## [[8]]
## Warning: Removed 17 rows containing missing values (geom_point).
##
## [[9]]
## Warning: Removed 14 rows containing missing values (geom_point).
##
## [[10]]
## Warning: Removed 52 rows containing missing values (geom_point).
##
## [[11]]
## Warning: Removed 24 rows containing missing values (geom_point).
##
## [[12]]
## Warning: Removed 20 rows containing missing values (geom_point).
##
## [[13]]
## Warning: Removed 23 rows containing missing values (geom_point).
##
## [[14]]
## Warning: Removed 20 rows containing missing values (geom_point).
##
## [[15]]
## Warning: Removed 18 rows containing missing values (geom_point).
sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] phyloseq_1.26.1 ggrepel_0.8.1 forcats_0.4.0 stringr_1.4.0
## [5] dplyr_0.8.0.1 purrr_0.3.2 readr_1.3.1 tidyr_0.8.3
## [9] tibble_2.1.1 ggplot2_3.1.1 tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] Biobase_2.42.0 httr_1.4.0 jsonlite_1.6
## [4] splines_3.5.2 foreach_1.4.4 modelr_0.1.4
## [7] assertthat_0.2.1 stats4_3.5.2 cellranger_1.1.0
## [10] yaml_2.2.0 pillar_1.3.1 backports_1.1.4
## [13] lattice_0.20-38 glue_1.3.1 digest_0.6.18
## [16] XVector_0.22.0 rvest_0.3.3 colorspace_1.4-1
## [19] htmltools_0.3.6 Matrix_1.2-17 plyr_1.8.4
## [22] pkgconfig_2.0.2 broom_0.5.2 haven_2.1.0
## [25] zlibbioc_1.28.0 scales_1.0.0 mgcv_1.8-28
## [28] generics_0.0.2 IRanges_2.16.0 withr_2.1.2
## [31] BiocGenerics_0.28.0 lazyeval_0.2.2 cli_1.1.0
## [34] survival_2.44-1.1 magrittr_1.5 crayon_1.3.4
## [37] readxl_1.3.1 evaluate_0.13 fansi_0.4.0
## [40] nlme_3.1-139 MASS_7.3-51.4 xml2_1.2.0
## [43] vegan_2.5-4 tools_3.5.2 data.table_1.12.2
## [46] hms_0.4.2 Rhdf5lib_1.4.3 S4Vectors_0.20.1
## [49] munsell_0.5.0 cluster_2.0.8 Biostrings_2.50.2
## [52] ade4_1.7-13 compiler_3.5.2 rlang_0.3.4
## [55] rhdf5_2.26.2 grid_3.5.2 iterators_1.0.10
## [58] biomformat_1.10.1 rstudioapi_0.10 igraph_1.2.4
## [61] labeling_0.3 rmarkdown_1.12 gtable_0.3.0
## [64] codetools_0.2-16 multtest_2.38.0 reshape2_1.4.3
## [67] R6_2.4.0 lubridate_1.7.4 knitr_1.22
## [70] utf8_1.1.4 permute_0.9-5 ape_5.3
## [73] stringi_1.4.3 parallel_3.5.2 Rcpp_1.0.1
## [76] tidyselect_0.2.5 xfun_0.6